**** This do-file cleans the endline survey data for grade 10 students

* ---- Session setup ----------------------------------------------------------
clear all
set more off
set varabbrev off          // variable names must be typed in full
capture log close          // close a log left open by an earlier run, if any

log using "$logfile/build_grade10students_endline.log", replace

* ---- Load the raw endline export (data as of March 31) ----------------------
global endlinecsv "$raw/student/grade910/20220404174118-SurveyExport.csv"
import delimited "$endlinecsv", clear

* Drop respondents who answered that they are not Grade X students
drop if thissurveyisintendedforgradexstu == "No, I am not a Grade X student."

* Short names for the personal-information questions.
* `capture' means a rename that fails is silently skipped.
capture rename whatisyourname                    name_original_e
capture rename whatisyourgender                  sex
capture rename whatisyourcitizenshipidcode11dig  citizenid
capture rename inwhichdzongkhagisyourschoolloca  district
capture rename dateofbirthdaywhatisyourdateofbi  b_day
capture rename dateofbirthmonthwhatisyourdateof  b_month
capture rename dateofbirthyearwhatisyourdateofb  b_year
capture rename whatisyourstudentidcode           studentid
capture rename whatisyouremailaddress            email
capture rename whatisyourmobilephonenumber       phone

* Sex: code 2 becomes 0, so that the labels below read 0 = Female, 1 = Male
recode sex (2 = 0)
label define sex 0 "Female" 1 "Male"
label values sex sex

* Citizenship ID: strip punctuation, normalise case/spacing, remove "na"
format citizenid %15s
foreach ch in "." "]" "/" {
	replace citizenid = subinstr(citizenid, "`ch'", "", .)
}
replace citizenid = trim(itrim(lower(citizenid)))
replace citizenid = subinstr(citizenid, "na", "", .)
format citizenid %11s

* Blank the ID when it is not exactly 11 characters long or is one of the
* listed placeholder sequences
replace citizenid = "" if strlen(citizenid) != 11 | ///
	inlist(citizenid, "00000000000", "12345678910", "01234567890", "12345678902", "12345678901", "12345678912", "12345678911")

* District names in lower case
replace district = strlower(district)

* ---- Names ------------------------------------------------------------------
* name_original_e : lower-cased name as typed, minus the listed e-mail fragments
* name            : compact matching key (no spaces, no punctuation)

replace name_original_e = lower(name_original_e)
foreach frag in "@gemailcom" "@gmailcom" {
	replace name_original_e = subinstr(name_original_e, "`frag'", "", .)
}

generate name = name_original_e
replace name = trim(itrim(lower(name)))

* Remove spaces and apostrophes
replace name = subinstr(name, " ", "", .)
replace name = subinstr(name, "'", "", .)

* Remove comma, dot, slash, dash, equals sign and both parentheses
foreach ch in "," "." "/" "-" "=" "(" ")" {
	replace name = subinstr(name, "`ch'", "", .)
}

* Remove filler text; this runs after the dots are gone, so an address typed
* as "@gmail.com" has already become "@gmailcom" by this point
foreach frag in "mynameis" "@gmailcom" "@gemailcom" {
	replace name = subinstr(name, "`frag'", "", .)
}

format name %25s
format name_original_e %30s

* format studentid
* Normalise case and spacing, collapse doubled dots, and strip the e-mail
* domain fragments that some respondents appended to their student ID.
* The removals are order-dependent: longer fragments go first, then the
* leftovers of partially typed domains.
replace studentid = trim(itrim(lower(studentid)))
replace studentid = subinstr(studentid,"..",".",.)
replace studentid = subinstr(studentid," ","",.)
replace studentid = subinstr(studentid,"@education.gov.bt","",.)   // full domain
* All spaces were removed above, so the mistyped domain "@education.gov. t"
* now reads "@education.gov.t"; match that form (the spaced pattern could
* never match at this point).
replace studentid = subinstr(studentid,"@education.gov.t","",.)
replace studentid = subinstr(studentid,"ation.gov.bt","",.)
replace studentid = subinstr(studentid,"@educ","",.)
replace studentid = subinstr(studentid,"@.educ","",.)
* br responseid studentid if strlen(studentid)~=17 & ~missing(studentid)
* Blank the student ID for the listed responses (hard-coded response ids,
* presumably identified by inspection with the command above -- confirm)
* and for literal "na" answers.
replace studentid="" if inlist(responseid, 1099, 1139, 2126, 7133, 7626) | studentid=="na"

* School name: start from the main question and, where it is empty, fill it
* from the first non-empty column among v33 ... v42 (in that order)
rename whatisthenameofyourschool schoolname
forvalues col = 33/42 {
	replace schoolname = v`col' if !missing(v`col') & missing(schoolname)
}
replace schoolname = lower(schoolname)

* Partial submissions: keep a response only when it carries enough information
* to identify the student, i.e. at least one of
*   - a name longer than two characters,
*   - a student ID,
*   - any of sex / birth day / birth month / birth year.
* Complete submissions are always kept.
label variable status "survey status"
keep if status == "Complete" | ///
	(status == "Partial" & ///
		( (!missing(name_original_e) & strlen(name_original_e) > 2) ///
		| !missing(studentid) ///
		| !missing(sex) | !missing(b_month) | !missing(b_year) | !missing(b_day) ))

* ---- Programme-related questions asked at endline ---------------------------

* Indicator: 1 when the participation question equals 1, 0 otherwise
* (a missing answer is therefore coded 0)
generate treat_claim = (didyouparticipateinthementorship == 1)
label variable treat_claim "students claimed to have been participating in the program"

* Questions prefixed treat_*
rename (howmanytimesdidyoumeetyourmentor v121) ///
       (treat_meet_person treat_meet_online)
rename (overallsatisfactionwiththementor overallsatisfactionwiththeconten) ///
       (treat_satisfaction_mentor treat_satisfaction_program)
rename howlikelythatyouwillstayincontac treat_contact

* Questions prefixed untreated_*
rename doyouknowthatsomestudentsinyours untreated_tschool
rename haveyoutalkedwithanyofyourclassm untreated_talk
rename ifthementoringhadbeenavailableto untreated_interest

* Questions prefixed control_*
rename didyouhearaboutthestudentsmentor control_cschool
rename haveyoutalkedwithanyonewhoreceiv control_talk
rename v132 control_interest

* Recode answer 2 to 0 on these four items
recode untreated_tschool control_cschool untreated_interest control_interest (2 = 0)

* Restrict the dataset to identifiers, contact details and programme questions
keep responseid status schoolname name* sex citizenid district b_* ///
	studentid email* phone treat_* control* untreated*

* Phone: lower-case, remove "na", and add the endline suffix to the name
replace phone = subinstr(strlower(phone), "na", "", .)
rename phone phone_e

* ---- Date of birth ----------------------------------------------------------
* Convert the month name (string) into its number 1-12; any other text leaves
* b_month missing.
rename b_month month
g b_month=.
local monthnames January February March April May June July August September October November December
forvalues m = 1/12 {
	local mname : word `m' of `monthnames'
	replace b_month = `m' if month == "`mname'"
}
drop month

* Birthday as a YYYYMMDD number, then as a string.
* It must be stored as long: the default float type holds integers exactly
* only up to 2^24 = 16,777,216, so values such as 20050315 would be rounded
* to a neighbouring even number and the birthday silently altered.
* Observations with a missing component get a missing value, which tostring
* writes as ".".
g long bday_e = b_year*10^4 + b_month*10^2 + b_day
tostring bday_e, replace

// Flag student IDs that occur more than once (dup_studentid is 0 when unique)
duplicates tag studentid, generate(dup_studentid)

* Order rows so that, within a student ID, "Complete" sorts before "Partial"
* and lower response ids come first; the row kept is the first of each run.
sort studentid status responseid

* Repeated submissions: drop a flagged row when it repeats the row above it,
* either on (name, studentid, citizenid) or on (name, birth date, sex).
* The second rule does not compare studentid. Run three passes.
forvalues pass = 1/3 {
	drop if dup_studentid != 0 & name == name[_n-1] & ///
		studentid == studentid[_n-1] & citizenid == citizenid[_n-1]
	drop if dup_studentid != 0 & name == name[_n-1] & sex == sex[_n-1] & ///
		b_day == b_day[_n-1] & b_month == b_month[_n-1] & b_year == b_year[_n-1]
}

* Recompute the duplicate flag on what remains
drop dup_studentid
duplicates tag studentid, generate(dup_studentid)

* for invalid studentID and citizenID: attempt to recover studentid later on
* Endline suffix on the response id and status, then order by response id.
* (The earlier "sort studentid responseid" was overridden immediately by this
* sort and has been removed.)
rename responseid responseid_e
rename status status_e
sort responseid_e

* Interactive inspection only; kept commented out so the do-file can run
* unattended without opening the data browser.
* browse responseid_e schoolname

* A record goes to the "recover later" file when its student ID is one of the
* two listed placeholder values, is not exactly 17 characters long, or is
* shared with another record while the citizenship ID is missing.
* The condition is defined once so the saved file and the drop below cannot
* drift apart.
local bad_id `"inlist(studentid, "000.00000.00.0000", "111.22222.33.4444") | strlen(studentid) != 17 | (missing(citizenid) & dup_studentid != 0)"'

preserve
keep if `bad_id'
drop dup_studentid
save "$temp/g10_p0e3.dta", replace
restore

drop if `bad_id'

* save as two datasets: with unique ID and non-unique ID
* g10_p0e1.dta : records whose student ID is unique
* g10_p0e2.dta : records whose student ID is still shared after de-duplication

* 1) records with a unique student ID
preserve
keep if dup_studentid==0
drop dup_studentid
save "$temp/g10_p0e1.dta", replace
restore

* 2) records with a shared student ID: drop rows that repeat the row above.
* responseid_e is added as the last sort key because sort orders ties
* randomly; without it the [_n-1] comparisons below could drop different rows
* from one run to the next. With it, the lower response id is kept within
* equal (studentid, citizenid, status_e).
keep if dup_studentid~=0
sort studentid citizenid status_e responseid_e
drop if name==name[_n-1] & studentid==studentid[_n-1] & citizenid==citizenid[_n-1]
drop if b_day==b_day[_n-1] & b_month==b_month[_n-1] & b_year==b_year[_n-1] & sex==sex[_n-1] & studentid==studentid[_n-1]
drop dup_studentid
duplicates tag studentid, g(dup_studentid)

* 3) IDs that became unique after the drops join the unique-ID file
preserve
keep if dup_studentid==0
drop dup_studentid
append using "$temp/g10_p0e1.dta"
save "$temp/g10_p0e1.dta", replace
restore

* 4) whatever is still duplicated is saved separately (dup_studentid is kept)
drop if dup_studentid==0
save "$temp/g10_p0e2.dta", replace

